#####Note: COVID period is classed from March 2020 - June 2021, theriod between when lockdown was introduced, and when restrictions were fully lifted
library(tidyverse)
library(janitor) # cleaning data
library(gt) # tables
library(here) # directory structure
library(readxl) # read in excel file
library(sf) #to read in map data
library(plotly) #to display information when plot is hovered over
#Get data from 24-month period from Mar 2020 - Feb 2022
#create function to read data in:
prescriptions_data <- function() {
files <- c(paste0(c("jan", "feb","mar", "apr", "may", "jun", "jul", "aug", "sep", "oct", "nov", "dec"), "2020"),
paste0(c("jan", "feb", "mar", "apr", "may", "jun", "jul", "aug", "sep", "oct", "nov", "dec"), "2021"))
map_dfr(files, ~read_csv(here("data", paste0(.x, ".csv"))) %>%
clean_names() %>%
mutate(file_name = .x))}
# Read in data
data_for_years <- prescriptions_data()
deprivation_stats <- read_excel(here("data/SIMD.xlsx"))
#Read in and tidy healthboard population database
population_data <- read_csv(here("data/general_health_census.csv"), skip = 10) %>%
filter(!is.na('All people')) %>% #Removes NA
rename(hb_name = "General health",
hb_population = "All people") %>%
select(hb_name, hb_population) %>%
filter(!str_detect(hb_population, "Cells")) #Removes rows with text in population (to remove copyright info)
diabetes_summary <- data_for_years %>%
#filter for diabetes medications
filter(!is.na(bnf_item_description)) %>%
filter(str_detect(bnf_item_description,
regex("metformin|gliptin|gliflozin|glutide|glitazone|gliclazide|glimepiride", ignore_case = TRUE))) %>%
group_by(paid_date_month, hbt) %>%
summarise(paid_quantity = sum(paid_quantity, na.rm = TRUE)) %>%
#change months from numbers to letters
mutate(paid_date_month = ym(paid_date_month))
# Merge all tables together
combined_health_data <- deprivation_stats %>%
full_join(diabetes_summary, join_by(HBcode == hbt), relationship = "many-to-many") %>%
left_join(population_data, join_by(HBname == hb_name))
# Now calculate prescriptions per person
combined_health_data <- combined_health_data %>%
group_by(HBname, paid_date_month) %>%
mutate(quantity_per_head = sum(paid_quantity, na.rm = TRUE) / mean(as.numeric(hb_population))) #calculate prescriptions per person and change hb_population to numeric value instead of character
# Creating a summary table and filtering for common type 2 medications
totalmeds_plot <- combined_health_data %>%
ggplot(aes(x = paid_date_month, y = quantity_per_head)) +
# Add vertical lines for COVID start and end
geom_vline(xintercept = as.numeric(ymd("2020-03-01")),
linetype = "dashed", colour = "purple", size = 0.8) +
geom_vline(xintercept = as.numeric(ymd("2021-06-01")),
linetype = "dashed", colour = "purple", size = 0.8) +
geom_line(colour = "darkblue", size = 0.5) + # Make original line lighter
labs(
title = "Prescription of type 2 dabetes medications per person",
subtitle = "January 2020 - December 2021 | Purple dashed lines: COVID period (Mar 2020 - Jun 2021)",
x = "Month",
y = "Quantity per head",
) +
theme_light() +
theme(axis.text.x = element_text(angle = 45, hjust=1)) +
facet_wrap(~HBname, scales = "free_y")
#labeller = label_wrap_gen(width = 10) ADD TO FACET WRAP
print(totalmeds_plot)
#MENTION FREED SCALES as there were vast differences in prescription across healthboards
key_months_data <- combined_health_data %>%
filter(paid_date_month %in% as.Date(c("2020-01-01", "2021-12-01"))) %>%
group_by(HBname) %>%
mutate(covid_phase = case_when(
paid_date_month == as.Date("2020-01-01") ~ "pre_covid",
paid_date_month == as.Date("2021-12-01") ~ "post_covid"))
# Summarise by health board and COVID phase
lollipop_data <- key_months_data %>%
group_by(HBname, covid_phase) %>%
summarise(avg_qph = mean(quantity_per_head, na.rm = TRUE),
avg_simd = mean(SIMD2020v2_Decile, na.rm = TRUE)) %>%
# Pivot to get Pre and Post columns for connecting lines
pivot_wider(names_from = covid_phase, values_from = avg_qph) %>%
mutate(percent_change = ((post_covid - pre_covid)/pre_covid)*100) %>% #to calculate percent change
# Arrange by descending avg_simd
arrange(desc(avg_simd)) %>%
ungroup() %>% #before creating a factor so hat HBname can be arranged in order of avg_simd in graph
# Create factor to preserve ordering in plot
mutate(HBname = factor(HBname, levels = HBname))
lollipop_graph <- lollipop_data %>%
ggplot() +
geom_segment(aes(x = HBname, xend = HBname, y = pre_covid, yend = post_covid),
color = "grey") +
geom_point(aes(x = HBname, y = pre_covid),
color = rgb(0.2, 0.7, 0.1, 0.5), size = 3) +
geom_point(aes(x = HBname, y = post_covid),
color = rgb(0.7, 0.2, 0.1, 0.5), size = 3) +
coord_flip() +
theme_minimal() +
theme(legend.position = "none") +
xlab("") +
ylab("Average Quantity Per Head") +
ggtitle("Change in Quantity Per Head: Pre-COVID vs Post-COVID",
subtitle = "NHS Health Boards ordered by average SIMD decile (descending)")
# Display the plot
print(lollipop_graph)
# load the NHS Health board Shapefile
NHS_healthboards <- st_read(here("data/Week6_NHS_HealthBoards_2019.shp"))
## Reading layer `Week6_NHS_healthboards_2019' from data source
## `/Users/sekemiadenuga/data-science/B248209/data/Week6_NHS_healthboards_2019.shp'
## using driver `ESRI Shapefile'
## Simple feature collection with 14 features and 4 fields
## Geometry type: MULTIPOLYGON
## Dimension: XY
## Bounding box: xmin: 7564.996 ymin: 530635.8 xmax: 468754.8 ymax: 1218625
## Projected CRS: OSGB36 / British National Grid
#Join spatial data with other health data
summary_map <- NHS_healthboards %>%
full_join(lollipop_data, by = join_by(HBName == HBname))
# Plot percentage change map
plot_map <- summary_map %>%
ggplot() +
geom_sf(aes(fill = percent_change, text = paste0("Health Board: ", HBName, "\n", round(percent_change,2), "%")), color = "white", size = 0.3) +
scale_fill_viridis_c(option = "plasma",
name = "% Change in Average Quantity per Head") +
labs(
title = "Percentage Change in Average Type 2 Diabetes Prescriptions Per Head \nPre-COVID → Post-COVID",
caption = "Data from Jan 2020 and Dec 2021"
) +
theme_minimal() +
theme(legend.position = "right", plot.title = element_text(face = "bold"))
interactive_map <- ggplotly(plot_map, tooltip = "text")
interactive_map
ranked_table <- lollipop_data %>% # replace with your dataframe name
arrange(avg_simd) %>%
mutate(Rank = row_number()) %>%
select(c(Rank, HBname, pre_covid, percent_change))
# ---- Step 2: GT table ----
ranked_table %>%
gt() %>%
cols_label(
Rank = "Rank",
HBname = "Health Board",
pre_covid = "Pre-COVID Level (per head)",
#absolute_change = "Absolute Increase",
percent_change = "Percentage Increase (%)"
) %>%
fmt_number(
columns = c(pre_covid),
decimals = 2
) %>%
fmt_number(
columns = percent_change,
decimals = 1
) %>%
tab_header(
title = " Health Boards by Percentage Increase in Type 2 Diabetes Prescriptions",
subtitle = "Comparing pre-COVID (Jan 2020) to post-COVID (Dec 2021)"
)
| Health Boards by Percentage Increase in Type 2 Diabetes Prescriptions | |||
| Comparing pre-COVID (Jan 2020) to post-COVID (Dec 2021) | |||
| Rank | Health Board | Pre-COVID Level (per head) | Percentage Increase (%) |
|---|---|---|---|
| 1 | Ayrshire and Arran | 1,980.79 | 18.8 |
| 2 | Lanarkshire | 3,034.89 | 17.8 |
| 3 | Greater Glasgow and Clyde | 4,991.68 | 19.3 |
| 4 | Western Isles | 107.14 | 29.0 |
| 5 | Dumfries and Galloway | 700.08 | 18.0 |
| 6 | Fife | 1,524.35 | 18.0 |
| 7 | Tayside | 1,526.62 | 21.1 |
| 8 | Highland | 1,289.34 | 23.5 |
| 9 | Forth Valley | 1,454.79 | 20.8 |
| 10 | Borders | 468.36 | 20.0 |
| 11 | Orkney | 97.10 | 19.2 |
| 12 | Lothian | 2,941.78 | 24.3 |
| 13 | Shetland | 87.49 | 14.4 |
| 14 | Grampian | 2,040.41 | 22.7 |
```
##MAKE DATA PER 100,000 PEOPLE ++ Add what line says, + DOUBLE CHECK LABELS + HOVER OVER MAP SHOULD SAY HEALTHBOARD NAME